In [5]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket
from matplotlib import pylab
import sys
import yaml
import os
import matplotlib.pyplot as plt
import scvelo as scv
from pybiomart import Server
from tqdm.notebook import tqdm
import itertools
import plotly.express as px
import math

import random

warnings.filterwarnings('ignore')


# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (4, 4)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5

Configure paths¶

In [6]:
sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=50, facecolor='white')
pylab.rcParams['figure.figsize'] = (10, 10)
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
In [24]:
hostRoot = "-".join(socket.gethostname().split('-')[0:2])


#indir=paths["paths"]["indir"][hostRoot]
outdir="./outdir"
FinaLeaf = "/Progenitors"
externalDataNowaraw = "./data/Badhuri_atlas.h5ad"

#projectBaseDir=paths["paths"]["projectBaseDir"][hostRoot]
adataPath = outdir+"/BadhuriCurated.h5ad"
badhuriMarkers = ["FOS","DDIT3","CEBPD","HOPX","SUB1","HES4","NR2F1","NFIA","FOXO3","FOXP1","NFIB","TCF4","NFIX","HMGB2","SOX11","ZBTB18","BCL11A","ZNF462","ZIC2"]
Radial_cluster_label_aggr = ["radial"]
Radial_geschwindIngested = ["vRG"]
individualsToRemove = ["GW20"]

Re analyses of prebulk¶

In [8]:
adata = sc.read_h5ad(adataPath)
In [9]:
adata.obs.columns
Out[9]:
Index(['development_stage_ontology_term_id', 'individual', 'brain_region',
       'cortical_area', 'lamina', 'cluster_label', 'ngene', 'numi',
       'percent_mito', 'percent_ribo', 'ncount_rna', 'nfeature_rna',
       'tissue_ontology_term_id', 'assay_ontology_term_id',
       'disease_ontology_term_id', 'cell_type_ontology_term_id',
       'ethnicity_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'sex_ontology_term_id', 'cell_type',
       'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity',
       'development_stage', 'S_score', 'G2M_score', 'phase',
       'cluster_label_aggr', 'batch', 'cluster_label_area_age',
       'cluster_label_area', 'GeschIngestedAnno', 'cluster_label_area_layers'],
      dtype='object')
In [10]:
#Select dividing
adata = adata[(adata.obs["GeschIngestedAnno"].isin(Radial_geschwindIngested)) & (adata.obs["cluster_label_aggr"].isin(Radial_cluster_label_aggr))]

adata = adata[~adata.obs["individual"].isin(individualsToRemove)]
adata
Out[10]:
View of AnnData object with n_obs × n_vars = 9106 × 1201
    obs: 'development_stage_ontology_term_id', 'individual', 'brain_region', 'cortical_area', 'lamina', 'cluster_label', 'ngene', 'numi', 'percent_mito', 'percent_ribo', 'ncount_rna', 'nfeature_rna', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'S_score', 'G2M_score', 'phase', 'cluster_label_aggr', 'batch', 'cluster_label_area_age', 'cluster_label_area', 'GeschIngestedAnno', 'cluster_label_area_layers'
    var: 'feature_name', 'feature_reference', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'GeschIngestedAnno_colors', 'X_normalization', 'batch_colors', 'cell_type_colors', 'cluster_label_aggr_colors', 'cluster_label_area_colors', 'cluster_label_area_layers_colors', 'cortical_area_colors', 'hvg', 'individual_colors', 'neighbors', 'pca', 'phase_colors', 'schema_version', 'tissue_colors', 'title', 'umap'
    obsm: 'X_fastpca', 'X_pca', 'X_umap', 'pca_harmony'
    varm: 'PCs'
    layers: 'norm_nolog'
    obsp: 'connectivities', 'distances'
In [11]:
adata.obs[["GeschIngestedAnno","cluster_label_aggr"]]
Out[11]:
GeschIngestedAnno cluster_label_aggr
gw19_2_PFC_AGTTCGATCGACCCAG vRG radial
gw19_2_PFC_GCTTGGGCAATTTCTC vRG radial
gw19_2_Motor_AACCACAAGAGCTGCA vRG radial
gw19_2_Motor_ACGGAAGTCCACGTGG vRG radial
gw19_2_Motor_ATCGCCTGTACAGTCT vRG radial
... ... ...
GW16_V1_GGTTCTCTCATCGCAA vRG radial
GW16_V1_GGTTGTACAACCGCTG vRG radial
GW16_V1_TAAGTCGAGTTGGAGC vRG radial
GW16_V1_TCAAGACGTTGCTCCT vRG radial
GW16_V1_TTGCGTCCATGAGGGT vRG radial

9106 rows × 2 columns

Re process¶

In [12]:
adata = adata.raw.to_adata()
adata.raw = adata.copy()
sc.pp.scale(adata, max_value=20)
sc.tl.pca(adata, svd_solver='arpack')
import scanpy.external as sce
sce.pp.harmony_integrate(adata,key="batch", max_iter_harmony = 20, adjusted_basis = "pca_harmony")
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:04)
2023-04-26 19:22:27,935 - harmonypy - INFO - Iteration 1 of 20
2023-04-26 19:22:32,344 - harmonypy - INFO - Iteration 2 of 20
2023-04-26 19:22:41,196 - harmonypy - INFO - Converged after 2 iterations
In [13]:
sc.pp.neighbors(adata, n_neighbors= 100, n_pcs=50, use_rep="pca_harmony")
sc.tl.umap(adata)
sc.tl.diffmap(adata)
computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:29)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:00)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:00)
    eigenvalues of transition matrix
    [1.         0.93052393 0.90235215 0.89221084 0.88726026 0.8771377
     0.86251444 0.8563181  0.8271251  0.81704575 0.8101579  0.8002406
     0.7958678  0.79323226 0.78669226]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)

Diffmap¶

In [14]:
sc.pl.diffmap(adata, size = 100,color=["batch","cluster_label_aggr","GeschIngestedAnno","individual","TOP2A"], components=["1,3","1,2","3,4","2,3"], ncols=2)

Leiden¶

In [15]:
sc.tl.leiden(adata, resolution=.5, key_added="leiden")
running Leiden clustering
    finished: found 6 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:13)

UMAP¶

In [16]:
sc.pl.umap(adata, size = 30,color=["batch","cortical_area","cluster_label_aggr","GeschIngestedAnno","individual","TOP2A","leiden","DCX","HOPX","S100B","phase"],  ncols=2, vmin="p1",vmax="p99")

DE¶

In [17]:
sc.tl.rank_genes_groups(adata, groupby="leiden")
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var'
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:01)
In [18]:
pylab.rcParams['figure.figsize'] = (7, 7)

sc.pl.rank_genes_groups(adata, sharey=False)
pylab.rcParams['figure.figsize'] = (9, 9)

Plot per individual¶

In [19]:
for ID in adata.obs.individual.unique():
    sc.pl.umap(adata,color="individual", groups=[ID], size = 30, na_color="black")

Save BC list¶

In [27]:
if not os.path.isdir(outdir+FinaLeaf):
    os.mkdir(outdir+FinaLeaf)
In [28]:
pd.Series(adata.obs_names).to_csv(outdir+FinaLeaf+"/Badhuri_progenitors_prebulk.Curated.barcodelist.tsv", sep="\t")
In [ ]: